Introduction

This analysis attempts to determine the top skills for a data scientist, based on job listings for data scientist roles. Different text mining techniques are used to identify skills from the description section of the job listings.

# Libraries used
library(dplyr)
library(ggplot2)
library(ggraph)
library(igraph)
library(quanteda)
library(tidyr)
library(tidytext)
library(tidyverse)
library(topicmodels)

Process Input Data

Following the NASA case study example presented at https://www.tidytextmining.com/nasa.html for Term Frequency and Topic Modeling.

# Read in CSV file of the 10000 job listings for Data Scientist.
# stringsAsFactors = FALSE keeps text columns as character vectors,
# which the tokenizers below require.
jobs_df <- read.csv(file = 'data_scientist_united_states_job_postings_jobspikr.csv', stringsAsFactors = FALSE)

# For testing purposes, keep just the first n rows only.
# seq_len(min(...)) avoids the NA-padded rows that jobs_df[1:2000, ]
# would create if the file had fewer than 2000 rows.
n_rows <- 2000
jobs_df <- jobs_df[seq_len(min(n_rows, nrow(jobs_df))), ]

Job description is the most important field in the dataset for the data mining exercise. The field contains the complete write-up posted for the job listing.

# Collect the description text, keyed by each listing's unique id.
jobs_desc <- tibble(
  id   = jobs_df$uniq_id,
  desc = jobs_df$job_description
)

# Peek at five random descriptions to sanity-check the field.
sample_n(select(jobs_desc, desc), 5)
## # A tibble: 5 x 1
##   desc                                                                     
##   <chr>                                                                    
## 1 "Are you interested in expanding your career through experience and expo…
## 2 RESPONSIBILITIES: Kforce has a client seeking a Lead Data Scientist in C…
## 3 "Read what people are saying about working here. \n\nJob Type:\n\nReq ID…
## 4 "Read what people are saying about working here. Job Title\n\nHealthcare…
## 5 "Read what people are saying about working here. \n\nPosition Descriptio…

The job category provides some context of the job listing and will be used to capture important words per category.

# Pair each listing's unique id with its job category.
jobs_cat <- tibble(id = jobs_df$uniq_id,
                   category = jobs_df$category)

# Drop listings with a blank category. Referencing the bare column
# (category) instead of jobs_cat$category is the idiomatic dplyr form
# and stays correct if the pipeline is reordered or grouped later.
jobs_cat <- jobs_cat %>% filter(category != "") %>% print(n=10)
## # A tibble: 1,643 x 2
##    id                               category          
##    <chr>                            <chr>             
##  1 3b6c6acfcba6135a31c83bd7ea493b18 Accounting/Finance
##  2 1c8541cd2c2c924f9391c7d3f526f64e Accounting/Finance
##  3 445652a560a5441060857853cf267470 biotech           
##  4 9571ec617ba209fd9a4f842973a4e9c8 Accounting/Finance
##  5 0ec629c03f3e82651711f2626c23cadb Accounting/Finance
##  6 80d64b46bc7c89602f63daf06b9f1b4c Accounting/Finance
##  7 9cd3ed78e5cac9e516ea41173de2c25f Computer/Internet 
##  8 53224da901548e7137bbb163d456ba6a Computer/Internet 
##  9 010b5d671896d26eba50948c7c337f94 Computer/Internet 
## 10 4a8b875d1acc4f560716561d699aa022 Computer/Internet 
## # … with 1,633 more rows

From the job_description, tokenize all the words and remove “stop_words” which are common words in the English language to allow for focus on meaningful words of the job listing.

# Use tidytext’s unnest_tokens() for the description field so we can do the text analysis.
# unnest_tokens() will tokenize all the words in the description field and create a tidy dataframe of the word by identifer

# Tokenize each description into one word per row, then drop common
# English stop words via an anti-join. Note that unnest_tokens()
# lowercases tokens by default (to_lower = TRUE).
jobs_desc <- jobs_desc %>% 
  unnest_tokens(word, desc) %>% 
  anti_join(stop_words)
## Joining, by = "word"
jobs_desc
## # A tibble: 610,000 x 2
##    id                               word         
##    <chr>                            <chr>        
##  1 3b6c6acfcba6135a31c83bd7ea493b18 read         
##  2 3b6c6acfcba6135a31c83bd7ea493b18 people       
##  3 3b6c6acfcba6135a31c83bd7ea493b18 farmers      
##  4 3b6c6acfcba6135a31c83bd7ea493b18 join         
##  5 3b6c6acfcba6135a31c83bd7ea493b18 team         
##  6 3b6c6acfcba6135a31c83bd7ea493b18 diverse      
##  7 3b6c6acfcba6135a31c83bd7ea493b18 professionals
##  8 3b6c6acfcba6135a31c83bd7ea493b18 farmers      
##  9 3b6c6acfcba6135a31c83bd7ea493b18 acquire      
## 10 3b6c6acfcba6135a31c83bd7ea493b18 skills       
## # … with 609,990 more rows

Provide count in table form of the most common words in the job descriptions.

# Table of the most frequent words across all job descriptions.
print(count(jobs_desc, word, sort = TRUE), n = 10)
## # A tibble: 16,753 x 2
##    word           n
##    <chr>      <int>
##  1 data       23515
##  2 experience 10467
##  3 business    5479
##  4 learning    4599
##  5 science     4423
##  6 analytics   3999
##  7 analysis    3681
##  8 team        3651
##  9 skills      3648
## 10 machine     3585
## # … with 1.674e+04 more rows

Applying lowercase to all the words to ensure different cases of the same word aren’t considered different.

# lowercase all the words just to make sure there's no redundancy
# NOTE(review): unnest_tokens() already lowercased the tokens by default,
# so this step is presumably a no-op safety net — confirm if needed.
jobs_desc <- jobs_desc %>% 
  mutate(word = tolower(word))

Term Frequency

The term frequency times inverse document frequency (TF-IDF) is used to identify words that are especially important to a document within a collection of documents. The results are the most important words in the description fields as measured by TF-IDF, meaning the words are common but not too common.

  1. Calculate the TF-IDF
# TF-IDF for each (document, word) pair; each listing id is one document.
word_freqs <- count(jobs_desc, id, word, sort = TRUE)
desc_tf_idf <- bind_tf_idf(word_freqs, word, id, n)

# Highest tf-idf words among those appearing at least 10 times.
desc_tf_idf %>%
  filter(n >= 10) %>%
  arrange(desc(tf_idf))
## # A tibble: 2,677 x 6
##    id                               word          n     tf   idf tf_idf
##    <chr>                            <chr>     <int>  <dbl> <dbl>  <dbl>
##  1 3060feb695b8e87923f8a58cf24fef40 nbsp         21 0.0868  5.11  0.444
##  2 5864a13f0123fbe568f87b25d5fd74df bull         21 0.0595  6.91  0.411
##  3 948b573aa265210fa845a2309201a079 beacon       13 0.0526  6.91  0.364
##  4 5864a13f0123fbe568f87b25d5fd74df nbsp         25 0.0708  5.11  0.362
##  5 948b573aa265210fa845a2309201a079 hill         13 0.0526  6.50  0.342
##  6 0ec629c03f3e82651711f2626c23cadb licensing    14 0.0601  5.65  0.340
##  7 919e0dff20e8ef9c673a2438861c3a24 munich       11 0.044   7.60  0.334
##  8 754bd14947b7e23f004405174c46622e scripps      12 0.0432  7.60  0.328
##  9 3fc6d7717e5b98369a49ecf9bd39d85b biztalk      13 0.0451  6.91  0.312
## 10 64968dba1885b3593bc60d21c6dd7b3f biztalk      13 0.0451  6.91  0.312
## # … with 2,667 more rows
  1. Combine the data frame of the TF_IDF of the job descriptions with the job categories.

The join is performed on the unique ID as key. Joining with the categories will identify the most important words from the job descriptions per job category.

# Attach the job category to each tf-idf row (join on the unique id).
desc_tf_idf <- full_join(desc_tf_idf, jobs_cat, by = "id")

# Top distinct words per category by tf-idf, one facet per category.
plot_data <- desc_tf_idf %>%
  filter(!near(tf, 1)) %>%
  filter(category %in% jobs_cat$category) %>%
  arrange(desc(tf_idf)) %>%
  group_by(category) %>%
  distinct(word, category, .keep_all = TRUE) %>%
  top_n(8, tf_idf) %>%
  ungroup() %>%
  mutate(word = factor(word, levels = rev(unique(word))))

ggplot(plot_data, aes(word, tf_idf, fill = category)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~category, ncol = 3, scales = "free") +
  coord_flip() +
  labs(title = "Highest tf-idf words in job listing description fields",
       caption = "From jobpickr dataset",
       x = NULL, y = "tf-idf")

The resulting plot did not prove useful for identifying skills across all the job listings for data scientist. The plot does indicate which words are more common across that specific job category. The results demonstrate that job listings by category are likely posted by the same company or same recruiter and thus the same boilerplate description is often used across many job listings.

Topic Modeling

In order to perform topic modeling, a document-term matrix is required.

  1. Calculate the word count by document ID. Each job description is considered a unique document by the job listing’s unique ID.
# 8.4 Topic Modeling

# Word counts per document (listing id); input for the document-term matrix.
word_counts <- ungroup(count(jobs_desc, id, word, sort = TRUE))

print(word_counts, n = 10)
## # A tibble: 381,507 x 3
##    id                               word          n
##    <chr>                            <chr>     <int>
##  1 3d8f0398cc54891ea8882148c5ccbc94 data         58
##  2 049784c976f14a08fd6ad657112ee142 data         56
##  3 97e787bb231a3947d4535d7caccc8a2b data         54
##  4 ad142ec84fe65c670d841b2209932020 data         53
##  5 65be41bc1431a3d7be4481500e43966c data         52
##  6 e5141eb0203ff1f338de433faa14581a based        52
##  7 e5141eb0203ff1f338de433faa14581a residence    52
##  8 2e3f084a7c9189ae5656a4e253e1878f data         51
##  9 38e1881e6ae9ed06bced92e784ce2216 data         50
## 10 5deab449ec2f808c4947a009aa1db19a data         50
## # … with 3.815e+05 more rows
  1. Construct the document-term matrix.

The resulting document-term matrix indicates a high level of sparsity. The non-zero entries do correspond to a certain word appearing in a particular document.

# Cast the tidy per-document word counts into a DocumentTermMatrix.
desc_dtm <- cast_dtm(word_counts, id, word, n)

desc_dtm
## <<DocumentTermMatrix (documents: 1998, terms: 16753)>>
## Non-/sparse entries: 381507/33090987
## Sparsity           : 99%
## Maximal term length: 60
## Weighting          : term frequency (tf)
  1. Calculate the LDA

According to Wikipedia, “In natural language processing, the latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar.”

# Running this model is time intensive.
# Fit a latent Dirichlet allocation model with 16 topics; the fixed
# seed makes the otherwise stochastic fit reproducible.
# Define there to be 16 topics.
desc_lda <- LDA(desc_dtm, k = 16, control = list(seed = 1234))
desc_lda
## A LDA_VEM topic model with 16 topics.
  1. Tidy the resulting LDA topics.
# Extract the per-topic word probabilities (the beta matrix, which is
# tidy()'s default for LDA objects) in tidy one-row-per-term form.
tidy_lda <- tidy(desc_lda, matrix = "beta")

tidy_lda
## # A tibble: 268,048 x 3
##    topic term    beta
##    <int> <chr>  <dbl>
##  1     1 data  0.0314
##  2     2 data  0.0274
##  3     3 data  0.0167
##  4     4 data  0.0530
##  5     5 data  0.0648
##  6     6 data  0.0458
##  7     7 data  0.0379
##  8     8 data  0.0287
##  9     9 data  0.0475
## 10    10 data  0.0270
## # … with 268,038 more rows
  1. Identify the top 10 terms for each topic.
# The ten highest-probability terms within each topic, ordered by topic
# and then by descending probability.
top_terms <- tidy_lda %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

top_terms
## # A tibble: 160 x 3
##    topic term          beta
##    <int> <chr>        <dbl>
##  1     1 data       0.0314 
##  2     1 scientist  0.0177 
##  3     1 experience 0.0151 
##  4     1 job        0.0130 
##  5     1 skills     0.00973
##  6     1 location   0.00902
##  7     1 python     0.00789
##  8     1 healthcare 0.00741
##  9     1 strong     0.00595
## 10     1 ca         0.00581
## # … with 150 more rows
  1. Plot the top 10 terms for each topic.

Even though the topics are anonymous, only identified by number, the groupings of words show some similarities and differences, but do not necessarily provide much value at this point.

The topic modeling process has identified groupings of terms that we can understand as human readers of these description fields.

# Faceted bar chart: top 10 terms per topic, reordered within each facet.
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  group_by(topic, term) %>%
  arrange(desc(beta)) %>%
  ungroup() %>%
  ggplot(aes(term, beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, ncol = 4, scales = "free") +
  scale_x_reordered() +
  coord_flip() +
  labs(title = "Top 10 terms in each LDA topic",
       x = NULL, y = expression(beta))

  1. Calculate gamma

Gamma will define the probability that each document belongs in each topic.

# Per-document topic probabilities (the gamma matrix) in tidy form.
lda_gamma <- tidy(desc_lda, matrix = "gamma")
lda_gamma
## # A tibble: 31,968 x 3
##    document                         topic     gamma
##    <chr>                            <int>     <dbl>
##  1 3d8f0398cc54891ea8882148c5ccbc94     1 0.0000274
##  2 049784c976f14a08fd6ad657112ee142     1 0.0000340
##  3 97e787bb231a3947d4535d7caccc8a2b     1 0.0000275
##  4 ad142ec84fe65c670d841b2209932020     1 0.0000605
##  5 65be41bc1431a3d7be4481500e43966c     1 0.0000313
##  6 e5141eb0203ff1f338de433faa14581a     1 1.000    
##  7 2e3f084a7c9189ae5656a4e253e1878f     1 0.0767   
##  8 38e1881e6ae9ed06bced92e784ce2216     1 0.0000264
##  9 5deab449ec2f808c4947a009aa1db19a     1 0.0000467
## 10 ea38905f0f2700d82861bb12d384b378     1 0.0000382
## # … with 31,958 more rows
  1. Identify the categories associated with each topic
# Attach each document's job category; the gamma "document" column holds
# the listing's unique id.
lda_gamma <- full_join(lda_gamma, jobs_cat, by = c("document" = "id"))
lda_gamma
## # A tibble: 32,000 x 4
##    document                      topic     gamma category                  
##    <chr>                         <int>     <dbl> <chr>                     
##  1 3d8f0398cc54891ea8882148c5cc…     1 0.0000274 Arts/Entertainment/Publis…
##  2 049784c976f14a08fd6ad657112e…     1 0.0000340 Sales                     
##  3 97e787bb231a3947d4535d7caccc…     1 0.0000275 Computer/Internet         
##  4 ad142ec84fe65c670d841b220993…     1 0.0000605 Insurance                 
##  5 65be41bc1431a3d7be4481500e43…     1 0.0000313 Manufacturing/Mechanical  
##  6 e5141eb0203ff1f338de433faa14…     1 1.000     Manufacturing/Mechanical  
##  7 2e3f084a7c9189ae5656a4e253e1…     1 0.0767    math jobs                 
##  8 38e1881e6ae9ed06bced92e784ce…     1 0.0000264 government                
##  9 5deab449ec2f808c4947a009aa1d…     1 0.0000467 Computer/Internet         
## 10 ea38905f0f2700d82861bb12d384…     1 0.0000382 Manufacturing/Mechanical  
## # … with 31,990 more rows
# Documents strongly assigned to a topic (gamma > 0.5), tallied by
# topic and category; rows without a category (unmatched join) are dropped.
top_cats <- lda_gamma %>%
  filter(gamma > 0.5) %>%
  count(topic, category, sort = TRUE) %>%
  filter(!is.na(category))

Topic 9 identifies ‘business and financial operations’ as the top category, and is the only topic to include the term ‘aws’. Topic 4, most identified with category ‘Arts/Entertainment/Publishing’, contains the terms ‘experience’ and ‘content’, which align with the category broadly.

# One more graph from 8.4.4: the five most frequent categories per topic.
top_cats %>%
  group_by(topic) %>%
  top_n(5, n) %>%
  ungroup() %>%
  mutate(category = reorder_within(category, n, topic)) %>%
  ggplot(aes(category, n, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_x_reordered() +
  facet_wrap(~ topic, ncol = 2, scales = "free") +
  labs(title = "Top categories for each LDA topic",
       x = NULL, y = "Number of documents")

Lexical Dispersion

Following the example at https://www.r-bloggers.com/advancing-text-mining-with-r-and-quanteda/ and https://quanteda.io/articles/pkgdown/examples/plotting.html

  1. Create a corpus based on the unique ID and the job description.
# Generate a quanteda corpus: one document per unique listing id, with
# the job description as the document text.
uniq_jobs_df <- distinct(jobs_df, uniq_id, .keep_all = TRUE)

my_corpus <- corpus(uniq_jobs_df,
                    docid_field = "uniq_id",
                    text_field = "job_description")

mycorpus_stats <- summary(my_corpus)
  1. Preprocess the text.

Remove numbers, remove punctuation, remove symbols, remove URLs, split hyphens. Clean for OCR.

# Preprocess the text

# Create tokens: strip numbers, punctuation, symbols, and URLs, and
# split hyphenated words into separate tokens.
token <-
  tokens(
    my_corpus,
    remove_numbers  = TRUE,
    remove_punct    = TRUE,
    remove_symbols  = TRUE,
    remove_url      = TRUE,
    split_hyphens   = TRUE
  )

# Clean tokens created by OCR. The regex patterns remove tokens that
# contain a digit or hyphen ("[\\d-]"), contain punctuation
# ("[[:punct:]]"), or are only one or two characters long ("^.{1,2}$").
token_ungd <- tokens_select(
  token,
  c("[\\d-]", "[[:punct:]]", "^.{1,2}$"),
  selection = "remove",
  valuetype = "regex",
  verbose = TRUE
)
## removed 1,737 features
  1. Create a Data Frequency Matrix

Using the Quanteda library, create the data frequency matrix and filter words that appear less than 7.5% and more than 90%.

# Data frequency matrix: lowercase, stem, and drop English stop words.
# NOTE(review): passing tolower/stem/remove directly to dfm() was
# deprecated in quanteda v3 (moved to tokens_*/dfm_* helpers) — confirm
# against the installed quanteda version.
my_dfm <- dfm(token_ungd,
              tolower = TRUE,
              stem = TRUE,
              remove = stopwords("english")
              )

# Keep only features whose document frequency falls between the two
# proportional bounds below.
my_dfm_trim <-
  dfm_trim(
    my_dfm,
    min_docfreq = 0.075,
    # min 7.5%
    max_docfreq = 0.90,
    # max 90%
    docfreq_type = "prop"
  )

# Show the most frequent features for the top documents.
head(dfm_sort(my_dfm_trim, decreasing = TRUE, margin = "both"),
     n = 10,
     nf = 10)
## Document-feature matrix of: 10 documents, 10 features (4.0% sparse) and 20 docvars.
##                                   features
## docs                               work analyt busi develop model learn
##   0f4fc982e72ca26aaad1f66637019652   15     10    6      10     6     6
##   9f2a8aee7e7a102720cfb3dbc39c6c24   11      2    0      28     4    18
##   6737d173f0bdf3d4d8fd3543518e40c7   11      2    0      30     4    18
##   3fcf1d237cf41524a247f9b940393b95   20     40   22      18    18     4
##   1ed541e018a0ff4788fcb7adbece18b2   15     18   24      16    24     8
##   7aa14f095302ee73feaa651c7430a0ad   10      9    2      16     6     6
##                                   features
## docs                               team scienc statist requir
##   0f4fc982e72ca26aaad1f66637019652   12     12      10     18
##   9f2a8aee7e7a102720cfb3dbc39c6c24   18     10      10     16
##   6737d173f0bdf3d4d8fd3543518e40c7   18     10      10     16
##   3fcf1d237cf41524a247f9b940393b95   18      0      12      4
##   1ed541e018a0ff4788fcb7adbece18b2   14      4      10     12
##   7aa14f095302ee73feaa651c7430a0ad    1      9       5     30
## [ reached max_ndoc ... 4 more documents ]
  1. Plot lexical dispersion

The plot shows the occurrences of the terms ‘python’ and ‘r’ across all documents for the state of Oregon (OR). The state was chosen to give a natural subset of all documents initially included.

The lexical dispersion appears to indicate the use of the terms ‘python’ and ‘r’ often occur in conjunction in the documents (job descriptions) which would indicate the listings are listing the two programming languages in or near the same sentence.

# Restrict to Oregon listings to keep the x-ray plot readable.
my_corpus_sub <- corpus_subset(my_corpus, state == "OR")

theme_set(theme_bw())

# Lexical dispersion of 'python' and 'r' across the subset documents.
g <- textplot_xray(
     kwic(my_corpus_sub, pattern = "python"),
     kwic(my_corpus_sub, pattern = "r")
)

g +
  aes(color = keyword) +
  scale_color_manual(values = c("blue", "red")) +
  theme(legend.position = "none")

Bigrams

From the tutorial at https://www.tidytextmining.com/. The bigrams identify the word pairs that occur most frequently.

  1. Identify bigrams of n=2.

This exercise finds bigrams of two words. The function does allow for bigrams of greater than two.

# Tokenize descriptions into two-word bigrams, then count them.
jobs_bigrams <- jobs_df %>%
  unnest_tokens(bigram, job_description, token = "ngrams", n = 2)

count(jobs_bigrams, bigram, sort = TRUE)
## # A tibble: 178,989 x 2
##    bigram               n
##    <chr>            <int>
##  1 machine learning  3501
##  2 data scientist    3193
##  3 of the            2970
##  4 data science      2938
##  5 experience with   2794
##  6 in the            2598
##  7 ability to        2555
##  8 experience in     2035
##  9 in a              1928
## 10 years of          1768
## # … with 178,979 more rows
# Split each bigram into its two component words.
bigrams_separated <- jobs_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Drop pairs where either word is a stop word.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word)

# new bigram counts:
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)

# This result is valuable
print(bigram_counts, n = 20)
## # A tibble: 93,532 x 3
##    word1         word2           n
##    <chr>         <chr>       <int>
##  1 machine       learning     3501
##  2 data          scientist    3193
##  3 data          science      2938
##  4 computer      science      1033
##  5 data          sets          950
##  6 data          analysis      925
##  7 equal         opportunity   860
##  8 data          mining        823
##  9 national      origin        710
## 10 data          scientists    662
## 11 veteran       status        658
## 12 communication skills        627
## 13 sexual        orientation   623
## 14 opportunity   employer      621
## 15 data          analytics     594
## 16 data          sources       593
## 17 color         religion      561
## 18 gender        identity      553
## 19 race          color         545
## 20 qualified     applicants    536
## # … with 9.351e+04 more rows
  1. Filter the bigrams

Include bigrams that occurred at least 250 times in order to filter out visual noise.

# filter for only relatively common combinations, then build a directed
# graph (word1 -> word2) from the surviving bigrams
bigram_graph <- graph_from_data_frame(filter(bigram_counts, n > 250))

bigram_graph
## IGRAPH 418ed33 DN-- 73 53 -- 
## + attr: name (v/c), n (e/n)
## + edges from 418ed33 (vertex names):
##  [1] machine      ->learning    data         ->scientist  
##  [3] data         ->science     computer     ->science    
##  [5] data         ->sets        data         ->analysis   
##  [7] equal        ->opportunity data         ->mining     
##  [9] national     ->origin      data         ->scientists 
## [11] veteran      ->status      communication->skills     
## [13] sexual       ->orientation opportunity  ->employer   
## [15] data         ->analytics   data         ->sources    
## + ... omitted several edges
  1. Visualize the bigrams in Network plot
# Fix the layout RNG so the network plot is reproducible.
set.seed(2020)

# Network of frequent bigrams using the Fruchterman-Reingold layout.
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

  1. Visualize the bigrams with Directional plot
# Fix the layout RNG so the directional plot is reproducible.
set.seed(2021)

# Closed arrowheads indicate bigram direction (word1 -> word2).
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

# Directed network: edge transparency (edge_alpha) encodes frequency.
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

Frequency Plot

Simple bar plot of the most common words across all the job listings’ descriptions.

# Tokenize descriptions, drop stop words, and count words per listing.
jobs_words <- jobs_df %>%
  unnest_tokens(word, job_description) %>%
  anti_join(stop_words) %>%
  count(uniq_id, word, sort = TRUE)
## Joining, by = "word"
# Total word count per listing.
total_words <- jobs_words %>% 
  group_by(uniq_id) %>% 
  summarize(total = sum(n))

jobs_words <- left_join(jobs_words, total_words)
## Joining, by = "uniq_id"
# NOTE(review): stop words were already removed above, so this second
# anti_join is presumably a no-op — confirm and consider removing.
jobs_words <- jobs_words %>%
    anti_join(stop_words)
## Joining, by = "word"
# Each row of jobs_words is a unique (listing, word) pair, so counting
# rows per word gives the number of listings containing the word — a
# document frequency, not the total number of occurrences (sum(n)).
jobs_words %>%
  count(word, sort = TRUE)
## # A tibble: 16,753 x 2
##    word           n
##    <chr>      <int>
##  1 data        1971
##  2 experience  1836
##  3 python      1541
##  4 scientist   1523
##  5 learning    1464
##  6 science     1445
##  7 team        1410
##  8 skills      1398
##  9 machine     1371
## 10 business    1353
## # … with 16,743 more rows
# Horizontal bar plot of words appearing in more than 500 listings.
jobs_words %>%
  count(word, sort = TRUE) %>%
  filter(n > 500) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip() +
  xlab(NULL)

Conclusion